Individual UCEs

The gene trees were constructed using ETE toolkit:

Preparatory step: installing Miniconda, following the instructions from http://etetoolkit.org/download/

The gene trees were estimated using RAxML:

ete3 build -w standard_raxml -n /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/uce-3.fasta -o /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/uce-3-tree

Automatically generate the corresponding command for each of the 230 alignment files in the directory:

cd /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA
ls

Copy the output to a text file:

# Get the 1st part of the command

locus_table <- read.table("Set1-individual-loci.txt")
n <- length(unlist(locus_table))

inputfile <- vector()
for(i in locus_table) {
  inputfile <- append(inputfile, paste("ete3 build -w standard_raxml -n /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/", i, sep = ""))
}

# Length check

length(inputfile) - n

# Remove file endings from the locus names

names_and_endings <- as.character(as.vector(as.matrix(locus_table)))
locus_names <- vector()
for(i in names_and_endings) {
  locus_names <- append(locus_names, sapply(strsplit(i, split='.', fixed=TRUE), function(x) (x[1])))
}

# Get the 2nd part of the command

outputfile <- vector()
for(i in locus_names) {
  outputfile <- append(outputfile, paste(" -o /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/", i, sep=""))
}

# Length check

length(outputfile) - n

# Get the entire command

commands <- paste(inputfile, outputfile, "-tree &&", sep="")

# Length check

length(commands) - n

# Uncomment to print to file:
# write(commands, "gene-tree-analysis.sh")

#!/bin/bash was then inserted into the first line of the file to convert it into a shell script. The script was run as follows:

chmod 755 /Users/David/Grive/Alfaro_Lab/SortaDate/Locus_analysis/gene-tree-analysis.sh
/Users/David/Grive/Alfaro_Lab/SortaDate/Locus_analysis/gene-tree-analysis.sh

Write a new script that will copy the trees from the nested subdirectories to the same directory where the alignment files are located:

# 1st part of the command

copyfiles <- vector()
for(i in locus_names) {
  copyfiles <- append(copyfiles, paste("cp /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/", i, sep = ""))
}

# 2nd part of the command

subdirectories <- vector()
for(i in locus_names) {
  subdirectories <- append(subdirectories, paste("-tree/clustalo_default-none-none-raxml_default/", i, sep = ""))
}

# Get the entire command

copying <- paste(copyfiles, subdirectories, ".fasta.final_tree.nw /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/ &&", sep="")

# Uncomment to print to file:
# write(copying, "copy-trees.sh")

Write a third script to rename the tree files so that they have the .tre file ending, which SortaDate looks for while searching its target directory:

# 1st part of the command

oldname <- vector()
for(i in locus_names) {
  oldname <- append(oldname, paste("mv /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/", i, sep = ""))
}

# 2nd part of the command

newname <- vector()
for(i in locus_names) {
  newname <- append(newname, paste(".fasta.final_tree.nw /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/", i, sep = ""))
}

# Get the entire command:

renametrees <- paste(oldname, newname, ".tre &&", sep = "")

# Uncomment to print to file:
write(renametrees, "rename-trees.sh")

Phase 1: get_var_length

python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/ --flend .tre --outf Locus_analysis/var-uces --outg alepisaurus_ferox,ceratoscopelus_warmingii

Phase 2: get_bp_genetrees

python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf Locus_analysis/bp-uces

Phase 3: combine_results

python src/combine_results.py Locus_analysis/var-uces Locus_analysis/bp-uces --outf Locus_analysis/comb-uces

Phase 4: get_good_genes

In order of descending priority: bipartition support, root-to-tip variance, tree length

python src/get_good_genes.py Locus_analysis/comb-uces --max 3 --order 3,1,2 --outf Locus_analysis/gg-uces-312
name root-to-tip_var treelength bipartition
uce-1184.tre 0.00510364 5.59114 0.491379310345
uce-383.tre 0.0094896 4.68126 0.465517241379
uce-1317.tre 0.0200846 8.06488 0.465517241379

In order of descending priority: bipartition support, tree length, root-to-tip variance

python src/get_good_genes.py Locus_analysis/comb-uces --max 3 --order 3,2,1 --outf Locus_analysis/gg-uces-321
name root-to-tip_var treelength bipartition
uce-1184.tre 0.00510364 5.59114 0.491379310345
uce-383.tre 0.0094896 4.68126 0.465517241379
uce-1317.tre 0.0200846 8.06488 0.465517241379

In order of descending priority: root-to-tip variance, tree length, bipartition support

python src/get_good_genes.py Locus_analysis/comb-uces --max 3 --order 1,2,3 --outf Locus_analysis/gg-uces-123
name root-to-tip_var treelength bipartition
uce-1075.tre 7.59021e-05 0.363795 0.0689655172414
uce-446.tre 0.000230387 0.897702 0.26724137931
uce-855.tre 0.000309407 1.03296 0.301724137931

In order of descending priority: root-to-tip variance, bipartition support, tree length

python src/get_good_genes.py Locus_analysis/comb-uces --max 3 --order 1,3,2 --outf Locus_analysis/gg-uces-132
name root-to-tip_var treelength bipartition
uce-1075.tre 7.59021e-05 0.363795 0.0689655172414
uce-446.tre 0.000230387 0.897702 0.26724137931
uce-855.tre 0.000309407 1.03296 0.301724137931

In order of descending priority: tree length, root-to-tip variance, bipartition support

python src/get_good_genes.py Locus_analysis/comb-uces --max 3 --order 2,1,3 --outf Locus_analysis/gg-uces-213
name root-to-tip_var treelength bipartition
uce-1075.tre 7.59021e-05 0.363795 0.0689655172414
uce-737.tre 0.000743068 0.814482 0.137931034483
uce-446.tre 0.000230387 0.897702 0.26724137931

In order of descending priority: tree length, bipartition support, root-to-tip variance

python src/get_good_genes.py Locus_analysis/comb-uces --max 3 --order 2,3,1 --outf Locus_analysis/gg-uces-231
name root-to-tip_var treelength bipartition
uce-1075.tre 7.59021e-05 0.363795 0.0689655172414
uce-737.tre 0.000743068 0.814482 0.137931034483
uce-446.tre 0.000230387 0.897702 0.26724137931

k-means partitions

Note that the largest, slowest-evolving partition (“ccf55a6ee6d62f840a124bcc0c98ecf5”; 132 kb) was excluded from the first round of analyses for computational reasons. Attempts to analyze it in RAxML after the remaining 31 analyses finished up were unsuccessful.

# Get the 1st part of the command

kmeans_table <- read.table("Set2-kmeans-partitions.txt")
n2 <- length(unlist(kmeans_table))

first <- vector()
for(i in kmeans_table) {
  first <- append(first, paste("ete3 build -w standard_raxml -n /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/", i, sep = ""))
}

# Length check

length(first) - n2

# Remove file endings from the locus names

no_endings <- as.character(as.vector(as.matrix(kmeans_table)))
kmeans_names <- vector()
for(i in no_endings) {
  kmeans_names <- append(kmeans_names, sapply(strsplit(i, split='.', fixed=TRUE), function(x) (x[1])))
}
kmeans_names

# Get the 2nd part of the command

second <- vector()
for(i in kmeans_names) {
  second <- append(second, paste(" -o /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/", i, sep=""))
}
second

# Length check

length(second) - n2

# Get the entire command

kmeansscript <- paste(first, second, "-tree &&", sep="")

# Length check

length(kmeansscript) - n2

# Uncomment to print to file:
# write(kmeansscript, "kmeans-analysis.sh")

Copy the trees into the directory containing the alignments:

# 1st part of the command

copytrees <- vector()
for(i in kmeans_names) {
  copytrees <- append(copytrees, paste("cp /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/", i, sep = ""))
}

# 2nd part of the command

locations <- vector()
for(i in kmeans_names) {
  locations <- append(locations, paste("-tree/clustalo_default-none-none-raxml_default/", i, sep = ""))
}

# Get the entire command

finalstep <- paste(copytrees, locations, ".phy.fasta.final_tree.nw /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/FASTA/ &&", sep="")

# Uncomment to print to file:
# write(finalstep, "copy-kmeans-trees.sh")

Change the tree names so that they correspond to the partition names:

# 1st part of the command

oldtreename <- vector()
for(i in kmeans_names) {
  oldtreename <- append(oldtreename, paste("mv /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/FASTA/", i, sep = ""))
}

# 2nd part of the command

newtreename <- vector()
for(i in kmeans_names) {
  newtreename <- append(newtreename, paste(".phy.fasta.final_tree.nw /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/FASTA/", i, sep = ""))
}

# Get the entire command:

changetreenames <- paste(oldtreename, newtreename, ".tre &&", sep = "")

# Uncomment to print to file:
# write(changetreenames, "rename-kmeans-trees.sh")

Finally, rename the partitions:

# 1st part of the command

oldpartition <- vector()
for(i in kmeans_names) {
  oldpartition <- append(oldpartition, paste("mv /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/FASTA/", i, sep = ""))
}

# 2nd part of the command

newpartition <- vector()
for(i in kmeans_names) {
  newpartition <- append(newpartition, paste(".phy.fasta /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/FASTA/", i, sep = ""))
}

# Get the entire command
  
renamepartitions <- paste(oldpartition, newpartition, ".fasta &&", sep = "")

# Uncomment to print to file:
# write(renamepartitions, "rename-partitions.sh")

Phase 1: get_var_length

python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/FASTA/ --flend .tre --outf var-kmeans --outg alepisaurus_ferox,ceratoscopelus_warmingii

Phase 2: get_bp_genetrees

python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/k-means_partitions/FASTA/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf bp-kmeans

Phase 3: combine_results

python src/combine_results.py var-kmeans bp-kmeans --outf comb-kmeans

Phase 4: get_good_genes

In order of descending priority: bipartition support, root-to-tip variance, tree length

name root-to-tip_var treelength bipartition
8be7c94dcf2d71970c663f6710af40d7.tre 0.0878182 19.8649 0.560344827586
f495e4e0f2f9bbbf091f778067b062f4.tre 0.0230499 11.0301 0.508620689655
0061a6137d1029978876fe13239f57bc.tre 0.0205424 11.2184 0.5

In order of descending priority: bipartition support, tree length, root-to-tip variance

name root-to-tip_var treelength bipartition
8be7c94dcf2d71970c663f6710af40d7.tre 0.0878182 19.8649 0.560344827586
f495e4e0f2f9bbbf091f778067b062f4.tre 0.0230499 11.0301 0.508620689655
0061a6137d1029978876fe13239f57bc.tre 0.0205424 11.2184 0.5

In order of descending priority: root-to-tip variance, tree length, bipartition support

name root-to-tip_var treelength bipartition
4ff6dce0b7cebc9428b4b77e79356484.tre 0.000241636 1.15434 0.0258620689655
589d12bf910a9937d941aa4c836ad87d.tre 0.000935835 2.0223 0.336206896552
fe7aa5d9de0f5f51ef6365ef3b7f1e06.tre 0.00125304 2.60574 0.0603448275862

In order of descending priority: root-to-tip variance, bipartition support, tree length

name root-to-tip_var treelength bipartition
4ff6dce0b7cebc9428b4b77e79356484.tre 0.000241636 1.15434 0.0258620689655
589d12bf910a9937d941aa4c836ad87d.tre 0.000935835 2.0223 0.336206896552
fe7aa5d9de0f5f51ef6365ef3b7f1e06.tre 0.00125304 2.60574 0.0603448275862

In order of descending priority: tree length, root-to-tip variance, bipartition support

name root-to-tip_var treelength bipartition
4ff6dce0b7cebc9428b4b77e79356484.tre 0.000241636 1.15434 0.0258620689655
589d12bf910a9937d941aa4c836ad87d.tre 0.000935835 2.0223 0.336206896552
fe7aa5d9de0f5f51ef6365ef3b7f1e06.tre 0.00125304 2.60574 0.0603448275862

In order of descending priority: tree length, bipartition support, root-to-tip variance

name root-to-tip_var treelength bipartition
4ff6dce0b7cebc9428b4b77e79356484.tre 0.000241636 1.15434 0.0258620689655
589d12bf910a9937d941aa4c836ad87d.tre 0.000935835 2.0223 0.336206896552
fe7aa5d9de0f5f51ef6365ef3b7f1e06.tre 0.00125304 2.60574 0.0603448275862

50-bp Chunks

A bash script was written to automate the following actions:

  1. Create a new directory nested in /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/FASTA for each locus
  2. Split the alignments for each locus into individual sequences, and these into 50-bp chunks. This step was performed using a custom Python script obtained from http://www.reddit.com/r/bioinformatics/comments/1u8yc7/looking_for_a_script_that_will_split_dna/ceg8rav/?st=j0tbjfco&sh=1e300055.

      '''
      Splits all sequences within a multi-fasta file into chunks of a specified size.
    
      Fasta header information is retained with each split sequence - its position in
      the original is appended to the id. Single-line and multi-line fasta files are
      supported. Prints to stout, so pipe to a file to store the result.
    
      Usage:
      python splitter.py <filename> <chunksize>
      python splitter.py myfile.fa 100
      '''
    
      from __future__ import print_function
      from sys import argv, version
    
      if version[0] == '2':
          from itertools import izip_longest as zl
      else:
          from itertools import zip_longest as zl
    
      chunksize = int(argv[2])
    
      def writeseq(header, seq):
          for i, chunk in enumerate(zl(*[iter(seq)]*chunksize, fillvalue='')):
              print(header + '_{}bp'.format(i*chunksize))
              print(''.join(chunk))
    
      with open(argv[1]) as f:
          header, seq = f.readline().rstrip(), ''
          for l in f:
              if l[0] != '>':
                  seq += l.rstrip()
              else:
                  writeseq(header, seq)
                  header, seq = l.rstrip(), ''
          writeseq(header, seq)
  3. For each locus, join the individual sequences chunk-wise (i.e., make a single fasta file for all taxa and sites 0 to 50, another one for all taxa and sites 51 to 100, etc.):

    library(dplyr)
    
    # Alternating rows (name, sequence, name, sequence) go to two different columns, so that
    # each sequence is correctly assigned to its respective taxon:
    
    split_seqs <- read.table("split.txt")
    odd <- as.vector(split_seqs[seq(1, nrow(split_seqs), 2), ])
    even <- as.vector(split_seqs[seq(2, nrow(split_seqs), 2), ])
    odd_name <- "taxon"
    even_name <- "sequence"
    split_seqs_new <- data.frame(odd, even)
    names(split_seqs_new) <- c(odd_name, even_name)
    
    # Determine how long the locus is (i.e., how many 50-bp chunks it has been split into).
    # This can be done by counting the number of occurrences of a single taxon name. 
    # In principle, any name could be used, but since not all of the UCEs include
    # all of the taxa, it is advisable to choose a taxon common to all the loci.
    
    n <- length(unique(grep("chaetodon_kleinii", split_seqs_new[,1], value = TRUE)))
    
    # Create a vector of strings that can filter taxon names according to the base pair range
    # tag attached to their end
    
    chunks <- vector()
    for(i in 0:(n-1)) {
      chunks <- append(chunks, paste("_", i*50, "bp", sep = ""))
    }
    
    # Create a list of data frames. Each element of the list represents a base pair range
    # and consists of a data frame containing both the "taxon" and "sequence" columns of
    # split_seqs_new
    
    partition <- list()
    for(i in 1:length(chunks)) {
      partition[[i]] <- data.frame(filter(split_seqs_new, 
                                          grepl(as.character(chunks[i]), taxon)))
    }
    
    # Create a matrix whose columns represents individual chunks (i.e., base pair ranges)
    # and whose rows have the structure of the original split_seqs data frame -- i.e., name,
    # sequence, name, sequence:
    
    chunkmatrix <- matrix(ncol = length(partition), 
                          nrow = 2*(nrow(split_seqs_new)/length(partition)))
    for(i in 1:length(partition)) {
      for(j in 1:nrow(partition[[i]])) {
        chunkmatrix[(2*j - 1), i] <- as.character(partition[[i]][j, "taxon"])
        chunkmatrix[2*j, i] <- as.character(partition[[i]][j, "sequence"])
      }
    }
    
    # Print the resulting fasta files!
    
    for(i in 1:ncol(chunkmatrix)) {
      write(chunkmatrix[,i], paste("chunk_", i, ".fasta", sep = ""))
    }
  4. Add a locus-indicating prefix to all the chunks of a given UCE:

    find *.fasta -maxdepth 0 ! -path . -exec mv {} uce-1005_{} \;
  5. Copy the resulting fasta files into a single directory.

The contents of the directory were then summarized as follows:

cd /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/
ls > /Users/David/Grive/Alfaro_Lab/SortaDate/Set3-50bp-chunks.txt

Now, change the taxon names in the chunk FASTA files so that they correspond to the names in the reference tree. First, create a file with the names of all the chunk files:

cd /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/
ls *.fasta > /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/chunklist.txt
a <- read.table("/Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/chunklist.txt")

x <- vector()
for(i in 1:nrow(a)) {
  x <- append(x, paste("/Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/", a[i,], sep = ""))
}

for(i in 1:length(x)) {
  c <- read.table(print(x[i]), stringsAsFactors = FALSE)
  d <- vector()
  for(j in 1:(nrow(c)/2)) {
    d[j] <- as.character(c[(2*j-1),])
  }
  
  e <- vector()
  for(j in 1:length(d)) {
    e[j] <- paste(sapply(strsplit(d[j], split="_", fixed=TRUE), function(x) (x[1])), 
                  "_", 
                  sapply(strsplit(d[j], split="_", fixed=TRUE), function(x) (x[2])),
                  sep = "")
  }
  
  for(j in 1:(nrow(c)/2)) {
    c[(2*j-1),] <- e[j]
  }
  write(as.matrix(c), print(x[i]), ncolumns=1)
}

A script was generated to analyze all of the alignment in the directory using RAxML:

chunk_table <- read.table("Set3-50bp-chunks.txt")

newfolders <- vector()
for(i in chunk_table) {
  newfolders <- append(newfolders, paste("ete3 build -w standard_raxml -n /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/", i, sep=""))
}

with_endings <- as.character(as.vector(as.matrix(chunk_table)))
chunk_names <- vector()
for(i in with_endings) {
  chunk_names <- append(chunk_names, sapply(strsplit(i, split='.', fixed=TRUE), function(x) (x[1])))
}

tree_location <- vector()
for(i in chunk_names) {
  tree_location <- append(tree_location, paste(" -o /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/", i, sep=""))
}

analyzechunks <- paste(newfolders, tree_location, sep="")
write(analyzechunks, "chunk-analysis.sh")

The resulting tree files were then copied into the directory containing the alignments:

copyfrom1 <- vector()
for(i in chunk_names) {
  copyfrom1 <- append(copyfrom1, paste("cp /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/", i, sep = ""))
}

copyfrom2 <- vector()
for(i in chunk_names) {
  copyfrom2 <- append(copyfrom2, paste("/clustalo_default-none-none-raxml_default/", i, sep = ""))
}

copyto <- paste(copyfrom1, copyfrom2, ".fasta.final_tree.nw /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/", sep="")

write(copyto, "copy-chunk-trees.sh")

Rename the trees:

brew install rename
rename -S .fasta.final_tree.nw .tre *.fasta.final_tree.nw

Running ls *.fasta | wc -l and ls *.nw | wc -l in the directory showed that out of 1826 chunk FASTA files, no more than 1070 had tree files associated with them.

Phase 1: get_var_length

python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/ --flend .tre --outf Chunk_analysis/var-chunks --outg alepisaurus_ferox,ceratoscopelus_warmingii

Phase 2: get_bp_genetrees

python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf Chunk_analysis/bp-chunks

Phase 3: combine_results

python src/combine_results.py Chunk_analysis/var-chunks Chunk_analysis/bp-chunks --outf Chunk_analysis/comb-chunks

Calculate the Robinson-Foulds distances of the individual chunk trees from the reference tree using the Python script below:

import os, uuid
from ete3 import Tree

t2 = Tree("/Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre")
for file in os.listdir("/Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks/"):
    if file.endswith(".tre"):
        t1 = Tree(file)
        try:
            rf = t1.robinson_foulds(t2)
            print str(file), (rf[0])
        except:
            pass

Step 1 of filtering: remove trees of zero length

No effect: the minimum observed tree length is non-zero and equal to:

## [1] 8.80081e-05

Phase 4: get_good_genes

In order of descending priority: bipartition support, root-to-tip variance, tree length

name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.176994 14.918 0.258620689655
uce-126_chunk_3.tre 0.104056 14.0824 0.224137931034
uce-323_chunk_2.tre 1.16412 48.0104 0.224137931034
## [1] "The Robinson-Foulds distances of the three bets chunks from the reference tree are as follows:"
##                   chunk  rf
## 321 uce-157_chunk_7.tre 164
##                   chunk  rf
## 219 uce-126_chunk_3.tre 166
##                   chunk  rf
## 443 uce-323_chunk_2.tre 168

The chunk with the minimum RF distance from the reference tree:

##                   chunk  rf
## 321 uce-157_chunk_7.tre 164

In order of descending priority: bipartition support, tree length, root-to-tip variance

name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.176994 14.918 0.258620689655
uce-126_chunk_3.tre 0.104056 14.0824 0.224137931034
uce-323_chunk_2.tre 1.16412 48.0104 0.224137931034

In order of descending priority: root-to-tip variance, tree length, bipartition support

name root-to-tip_var treelength bipartition
uce-1243_chunk_4.tre 1.05801e-11 0.0215722 0.0
uce-600_chunk_3.tre 1.28251e-11 0.0205929 0.00862068965517
uce-737_chunk_4.tre 2.16826e-11 0.0215345 0.0

In order of descending priority: root-to-tip variance, bipartition support, tree length

name root-to-tip_var treelength bipartition
uce-1243_chunk_4.tre 1.05801e-11 0.0215722 0.0
uce-600_chunk_3.tre 1.28251e-11 0.0205929 0.00862068965517
uce-737_chunk_4.tre 2.16826e-11 0.0215345 0.0

In order of descending priority: tree length, root-to-tip variance, bipartition support

name root-to-tip_var treelength bipartition
uce-855_chunk_6.tre 3.60737e-11 8.80081e-05 0.0
uce-413_chunk_4.tre 3.51022e-06 0.0205344 0.0
uce-600_chunk_3.tre 1.28251e-11 0.0205929 0.00862068965517

In order of descending priority: tree length, bipartition support, root-to-tip variance

name root-to-tip_var treelength bipartition
uce-855_chunk_6.tre 3.60737e-11 8.80081e-05 0.0
uce-413_chunk_4.tre 3.51022e-06 0.0205344 0.0
uce-600_chunk_3.tre 1.28251e-11 0.0205929 0.00862068965517

50-bp chunks: SH-like of 75% or above

  1. Copy all the tree files to a new directory:

    cd /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa
    mkdir Chunks_75
    cp Chunks/*.tre Chunks_75
  2. Collapse all the nodes with SH-like support values of less than 75% using the following Python script:

    import os, uuid
    from ete3 import Tree
    
    for file in os.listdir("/Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks"):
        if file.endswith(".tre"):
            outname = "/Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks_75/" + str(file)
            t = Tree(file, format=0)
    
            print t.get_ascii(attributes=['support', 'name'])
    
            for node in t.get_descendants():
                if not node.is_leaf() and node.support <= 0.75:
                    node.delete()
    
            print t.get_ascii(attributes=['support', 'name'])
    
            t.write(format=0, outfile=outname)
  3. Copy the fasta alignments to the same directory:

    cp Chunks/*.fasta Chunks_75

Phase 1: get_var_length

python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks_75/ --flend .tre --outf Chunk75_analysis/var-chunks75 --outg alepisaurus_ferox,ceratoscopelus_warmingii

Phase 2: get_bp_genetrees

python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks_75/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf Chunk75_analysis/bp-chunks75

Phase 3: combine_results

python src/combine_results.py Chunk75_analysis/var-chunks75 Chunk75_analysis/bp-chunks75 --outf Chunk75_analysis/comb-chunks75

A script was written to delete all lines containing NAs, as well as all lines that only contained an entry for bipartition support but none for root-to-tip branch length variance or tree length. This was accomplished by filling these partially empty lines with NAs in the first step and deleting them in the second step:

combchunks75 <- read.table("/Users/David/Grive/Alfaro_Lab/SortaDate/Chunk75_analysis/comb-chunks75", fill = TRUE)
filtered <- na.omit(combchunks75)
write.table(filtered,
      "/Users/David/Grive/Alfaro_Lab/SortaDate/Chunk75_analysis/comb-chunks75-filtered",
      sep = "\t",
      quote = FALSE,
      row.names = FALSE,
      col.names = FALSE)

The length of the filtered comb file is 901 lines, meaning that SortaDate failed to obtain root-to-tip variance and/or tree length for 169 trees.

Step 1 of filtering: remove trees of zero length

No effect: the minimum observed tree length is non-zero and equal to:

## [1] 0.0202802

Note that this result is orders of magnitude larger than those observed in the other five chunk datasets.

Phase 4: get_good_genes

In order of descending priority: bipartition support, root-to-tip variance, tree length

name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.163848 13.508 0.198275862069
uce-126_chunk_3.tre 0.101031 13.9331 0.181034482759
uce-323_chunk_2.tre 0.459664 40.9379 0.146551724138
## [1] "The Robinson-Foulds distances of the three best chunks from the reference tree are as follows:"
##                   chunk  rf
## 284 uce-157_chunk_7.tre 126
##                   chunk  rf
## 194 uce-126_chunk_3.tre 114
##                   chunk  rf
## 391 uce-323_chunk_2.tre 125

The chunk with the minimum RF distance from the reference tree:

##                    chunk  rf
## 189 uce-1263_chunk_2.tre 106

In order of descending priority: bipartition support, tree length, root-to-tip variance

name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.163848 13.508 0.198275862069
uce-126_chunk_3.tre 0.101031 13.9331 0.181034482759
uce-323_chunk_2.tre 0.459664 40.9379 0.146551724138

In order of descending priority: root-to-tip variance, tree length, bipartition support

name root-to-tip_var treelength bipartition
uce-815_chunk_8.tre 0.0 0.0202802 0.0
uce-359_chunk_7.tre 0.0 0.0203467 0.00862068965517
uce-160_chunk_3.tre 0.0 0.0204413 0.0

In order of descending priority: root-to-tip variance, bipartition support, tree length

name root-to-tip_var treelength bipartition
uce-200_chunk_2.tre 0.0 0.0317025 0.0344827586207
uce-129_chunk_5.tre 0.0 0.0211935 0.0258620689655
uce-1062_chunk_5.tre 0.0 0.113443 0.0258620689655

In order of descending priority: tree length, root-to-tip variance, bipartition support

name root-to-tip_var treelength bipartition
uce-815_chunk_8.tre 0.0 0.0202802 0.0
uce-359_chunk_7.tre 0.0 0.0203467 0.00862068965517
uce-160_chunk_3.tre 0.0 0.0204413 0.0

In order of descending priority: tree length, bipartition support, root-to-tip variance

name root-to-tip_var treelength bipartition
uce-815_chunk_8.tre 0.0 0.0202802 0.0
uce-359_chunk_7.tre 0.0 0.0203467 0.00862068965517
uce-160_chunk_3.tre 0.0 0.0204413 0.0

50-bp chunks: SH-like of 90% or above

(The first three steps were identical to those described above.)

Phase 1: get_var_length

python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks_90/ --flend .tre --outf Chunk90_analysis/var-chunks90 --outg alepisaurus_ferox,ceratoscopelus_warmingii

Phase 2: get_bp_genetrees

python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-114-taxa/Chunks_90/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf Chunk90_analysis/bp-chunks90

Phase 3: combine_results

python src/combine_results.py Chunk90_analysis/var-chunks90 Chunk90_analysis/bp-chunks90 --outf Chunk90_analysis/comb-chunks90

Delete the lines with NAs:

combchunks90 <- read.table("/Users/David/Grive/Alfaro_Lab/SortaDate/Chunk90_analysis/comb-chunks90", fill = TRUE)
filtered <- na.omit(combchunks90)
write.table(filtered,
      "/Users/David/Grive/Alfaro_Lab/SortaDate/comb-chunks90-filtered",
      sep = "\t",
      quote = FALSE,
      row.names = FALSE,
      col.names = FALSE)

The length of the filtered comb file is 536 lines, meaning that SortaDate failed to obtain root-to-tip variance and/or tree length for 534 (almost 50%) trees.

Step 1 of filtering: remove trees of zero length

No effect: the minimum observed tree length is non-zero and equal to:

## [1] 8.71304e-07

Phase 4: get_good_genes

In order of descending priority: bipartition support, root-to-tip variance, tree length

name root-to-tip_var treelength bipartition
uce-1317_chunk_8.tre 0.0166339 15.2225 0.0689655172414
uce-120_chunk_6.tre 0.187599 4.6936 0.0689655172414
uce-323_chunk_2.tre 0.325194 35.4206 0.0689655172414
## [1] "The Robinson-Foulds distances of the three best chunks from the reference tree are as follows:"
##                    chunk  rf
## 159 uce-1317_chunk_8.tre 111
##                  chunk  rf
## 81 uce-120_chunk_6.tre 114
##                   chunk  rf
## 248 uce-323_chunk_2.tre 107

The chunk with the minimum RF distance from the reference tree:

##                   chunk  rf
## 235 uce-267_chunk_7.tre 106

In order of descending priority: bipartition support, tree length, root-to-tip variance

name root-to-tip_var treelength bipartition
uce-120_chunk_6.tre 0.187599 4.6936 0.0689655172414
uce-1317_chunk_8.tre 0.0166339 15.2225 0.0689655172414
uce-323_chunk_2.tre 0.325194 35.4206 0.0689655172414

In order of descending priority: root-to-tip variance, tree length, bipartition support

name root-to-tip_var treelength bipartition
uce-537_chunk_3.tre 0.0 8.71304e-07 0.00862068965517
uce-84_chunk_5.tre 0.0 9.84452e-07 0.0172413793103
uce-739_chunk_2.tre 0.0 2.2254e-06 0.0344827586207

In order of descending priority: root-to-tip variance, bipartition support, tree length

name root-to-tip_var treelength bipartition
uce-739_chunk_2.tre 0.0 2.2254e-06 0.0344827586207
uce-832_chunk_4.tre 0.0 1.05876 0.0258620689655
uce-84_chunk_5.tre 0.0 9.84452e-07 0.0172413793103

In order of descending priority: tree length, root-to-tip variance, bipartition support

name root-to-tip_var treelength bipartition
uce-537_chunk_3.tre 0.0 8.71304e-07 0.00862068965517
uce-84_chunk_5.tre 0.0 9.84452e-07 0.0172413793103
uce-739_chunk_2.tre 0.0 2.2254e-06 0.0344827586207

In order of descending priority: tree length, bipartition support, root-to-tip variance

name root-to-tip_var treelength bipartition
uce-537_chunk_3.tre 0.0 8.71304e-07 0.00862068965517
uce-84_chunk_5.tre 0.0 9.84452e-07 0.0172413793103
uce-739_chunk_2.tre 0.0 2.2254e-06 0.0344827586207

50-bp chunks from the 75%-complete dataset

A bash script used to split the UCE loci into 50-bp chunks was identical to that used above, with the exception of step 3 (joining all taxa represented by a given chunk into a new FASTA file), for which the following code was used:

library(dplyr)

# Alternating rows (name, sequence, name, sequence) go to two different columns, so that
# each sequence is correctly assigned to its respective taxon:

split_seqs <- read.table("split.txt")
odd <- as.vector(split_seqs[seq(1, nrow(split_seqs), 2), ])
even <- as.vector(split_seqs[seq(2, nrow(split_seqs), 2), ])
odd_name <- "taxon"
even_name <- "sequence"
split_seqs_new <- data.frame(odd, even)
names(split_seqs_new) <- c(odd_name, even_name)

# Determine how long the locus is (i.e., how many 50-bp chunks it has been split into).
# This can be done by counting the number of occurrences of a single taxon name. 
# In principle, any name could be used, but since no taxon appears to be common to all
# the loci, the code below grabs the first taxon name appearing in the fasta file and
# counts its occurrences.

m <- paste(sapply(strsplit(as.character(split_seqs_new[1,1]), split="_", fixed=TRUE),
                  function(x) (x[1])), 
           "_", sapply(strsplit(as.character(split_seqs_new[1,1]), split="_", fixed=TRUE),                         function(x) (x[2])),
           sep = "")
n <- length(unique(grep(m, split_seqs_new[,1], value = TRUE)))

# Create a vector of strings that can filter taxon names according to the base pair range
# tag attached to their end

chunks <- vector()
for(i in 0:(n-1)) {
  chunks <- append(chunks, paste("_", i*50, "bp", sep = ""))
}

# Create a list of data frames. Each element of the list represents a base pair range
# and consists of a data frame containing both the "taxon" and "sequence" columns of
# split_seqs_new

partition <- list()
for(i in 1:length(chunks)) {
partition[[i]] <- data.frame(filter(split_seqs_new, 
                                    grepl(as.character(chunks[i]), taxon)))
}

# Create a matrix whose columns represents individual chunks (i.e., base pair ranges)
# and whose rows have the structure of the original split_seqs data frame -- i.e., name,
# sequence, name, sequence:

chunkmatrix <- matrix(ncol = length(partition), 
                      nrow = 2*(nrow(split_seqs_new)/length(partition)))
for(i in 1:length(partition)) {
  for(j in 1:nrow(partition[[i]])) {
    chunkmatrix[(2*j - 1), i] <- as.character(partition[[i]][j, "taxon"])
    chunkmatrix[2*j, i] <- as.character(partition[[i]][j, "sequence"])
  }
}

# Print the resulting fasta files!

for(i in 1:ncol(chunkmatrix)) {
  write(chunkmatrix[,i], paste("chunk_", i, ".fasta", sep = ""))
}

The taxon names in the chunk FASTA files were then stripped of the chunk-indicating suffixes so as to correspond with the names used in the reference tree, and a script was used to analyze all the chunks using RAxML. The resulting trees were then copied into the directory containing the chunks. Out of 6543 chunk FASTA files, only 4237 had tree files associated with them, suggesting that RAxML failed to infer a tree for a given chunk in approx. 35% of cases.

Phase 1: get_var_length

python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-90-taxa/Chunks/ --flend .tre --outf Min-90-chunk_analysis/var-min-90-chunks --outg alepisaurus_ferox,ceratoscopelus_warmingii

Running the script produced 1029 warnings about either Alepisaurus ferox or Ceratoscopelus warmingii missing from the chunk tree. Despite this, the tree length and root-to-tip variance was computed for all the 4237 trees.

Phase 2: get_bp_genetrees

python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-90-taxa/Chunks/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf Min-90-chunk_analysis/bp-min-90-chunks

Phase 3: combine_results

python src/combine_results.py Min-90-chunk_analysis/var-min-90-chunks Min-90-chunk_analysis/bp-min-90-chunks --outf Min-90-chunk_analysis/comb-min-90-chunks

Phase 4: get_good_genes

In order of descending priority: bipartition support, root-to-tip variance, tree length

name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.176994 14.918 0.258620689655
uce-1244_chunk_1.tre 0.0534134 11.1586 0.224137931034
uce-126_chunk_3.tre 0.104056 14.0824 0.224137931034
## [1] "The Robinson-Foulds distances of the three best chunks from the reference tree are as follows:"
##                    chunk  rf
## 1229 uce-157_chunk_7.tre 164
##                    chunk  rf
## 822 uce-1244_chunk_1.tre 158
##                   chunk  rf
## 907 uce-126_chunk_3.tre 166

The chunk with the minimum RF distance from the reference tree:

##                  chunk  rf
## 6 uce-1000_chunk_6.tre 136

In order of descending priority: bipartition support, tree length, root-to-tip variance

name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.176994 14.918 0.258620689655
uce-1244_chunk_1.tre 0.0534134 11.1586 0.224137931034
uce-126_chunk_3.tre 0.104056 14.0824 0.224137931034

In order of descending priority: root-to-tip variance, tree length, bipartition support

name root-to-tip_var treelength bipartition
uce-956_chunk_5.tre 2.3002e-12 8.56621e-05 0.0
uce-393_chunk_3.tre 5.54758e-12 9.22463e-05 0.0
uce-921_chunk_4.tre 5.8998e-12 7.89216e-05 0.0

In order of descending priority: root-to-tip variance, bipartition support, tree length

name root-to-tip_var treelength bipartition
uce-956_chunk_5.tre 2.3002e-12 8.56621e-05 0.0
uce-393_chunk_3.tre 5.54758e-12 9.22463e-05 0.0
uce-921_chunk_4.tre 5.8998e-12 7.89216e-05 0.0

In order of descending priority: tree length, root-to-tip variance, bipartition support

name root-to-tip_var treelength bipartition
uce-921_chunk_4.tre 5.8998e-12 7.89216e-05 0.0
uce-1173_chunk_3.tre 1.96604e-11 8.52591e-05 0.0
uce-956_chunk_5.tre 2.3002e-12 8.56621e-05 0.0

In order of descending priority: tree length, bipartition support, root-to-tip variance

name root-to-tip_var treelength bipartition
uce-921_chunk_4.tre 5.8998e-12 7.89216e-05 0.0
uce-1173_chunk_3.tre 1.96604e-11 8.52591e-05 0.0
uce-956_chunk_5.tre 2.3002e-12 8.56621e-05 0.0

Step 1 of filtering: remove trees of zero length

No effect: the minimum observed tree length is non-zero and equal to:

## [1] 7.89216e-05
## Number of trees after filtering: 4237
## Percentage of trees after filtering: 100%

Step 2 of filtering: remove the shortest and longest 10% of trees:

## Original min. tree length: 7.89216e-05
## Original max. tree length: 214.974
## Min. tree length after step 2: 0.285338
## Max. tree length after step 2: 24.1424
## Number of trees after step 2: 3389
## Percentage of trees after step 2: 79.98584%

Step 3 of filtering: remove the most unclocklike third of loci

## Max. root-to-tip variance from step 2: 2.72349
## Max. root-to-tip variance after step 3: 0.00905244
## Number of trees after step 3: 2270
## Percentage of trees after step 3: 53.57564%

Step 4 of filtering: keep the loci in the upper 10% of bipartition support

## Min. bipartition support from step 3: 0
## Min. bipartition support after step 4: 0.1034483
## Number of trees after step 4: 170
## Percentage of trees after step 4: 4.012273%

50-bp chunks from the 75%-complete dataset: SH-like of 75% or above

The nodes with SH-like support values below 75% were collapsed using the script above. The corresponding FASTA files were copied into the resulting directory, and SortaDate was run as follows:

Phase 1: get_var_length

python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-90-taxa/Chunks_75/ --flend .tre --outf Min-90-chunk75_analysis/var-min-90-chunks75 --outg alepisaurus_ferox,ceratoscopelus_warmingii

Running the script resulted in occasional segmentation faults as well as “this really only works with nexus or newick” warning messages. The resulting file had the full number of lines (4237: one for each tree), but some of them were blank and others contained NAs.

Phase 2: get_bp_genetrees

python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-90-taxa/Chunks_75/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf Min-90-chunk75_analysis/bp-min-90-chunks75

Phase 3: combine_results

python src/combine_results.py Min-90-chunk75_analysis/var-min-90-chunks75 Min-90-chunk75_analysis/bp-min-90-chunks75 --outf Min-90-chunk75_analysis/comb-min-90-chunks75

Delete the lines with NAs:

comb90chunks75 <- read.table("/Users/David/Grive/Alfaro_Lab/SortaDate/Min-90-chunk75_analysis/comb-min-90-chunks75", fill = TRUE)
filtered <- na.omit(comb90chunks75)
write.table(filtered,
      "/Users/David/Grive/Alfaro_Lab/SortaDate/Min-90-chunk75_analysis/comb-min-90-chunks75-filtered",
      sep = "\t",
      quote = FALSE,
      row.names = FALSE,
      col.names = FALSE)
nrow(filtered)

The length of the filtered comb file is 3589 lines, meaning that SortaDate failed to obtain root-to-tip variance and/or tree length for 648 trees.

Phase 4: get_good_genes

In order of descending priority: bipartition support, root-to-tip variance, tree length

name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.163848 13.508 0.198275862069
uce-1244_chunk_1.tre 0.040797 10.3002 0.181034482759
uce-126_chunk_3.tre 0.101031 13.9331 0.181034482759
## [1] "The Robinson-Foulds distances of the three best chunks from the reference tree are as follows:"
##                    chunk  rf
## 1064 uce-157_chunk_7.tre 126
##                    chunk  rf
## 710 uce-1244_chunk_1.tre 111
##                   chunk  rf
## 788 uce-126_chunk_3.tre 114

The chunk with the minimum RF distance from the reference tree:

##                    chunk rf
## 2016 uce-463_chunk_2.tre 85

In order of descending priority: bipartition support, tree length, root-to-tip variance

name root-to-tip_var treelength bipartition
uce-157_chunk_7.tre 0.163848 13.508 0.198275862069
uce-1244_chunk_1.tre 0.040797 10.3002 0.181034482759
uce-126_chunk_3.tre 0.101031 13.9331 0.181034482759

In order of descending priority: root-to-tip variance, tree length, bipartition support

name root-to-tip_var treelength bipartition
uce-1175_chunk_4.tre 0.0 8.48388e-07 0.00862068965517
uce-547_chunk_2.tre 0.0 1.18678e-06 0.0172413793103
uce-815_chunk_8.tre 0.0 0.0202802 0.0

In order of descending priority: root-to-tip variance, bipartition support, tree length

name root-to-tip_var treelength bipartition
uce-67_chunk_4.tre 0.0 0.0214551 0.0603448275862
uce-200_chunk_2.tre 0.0 0.0317025 0.0344827586207
uce-525_chunk_3.tre 0.0 0.020706 0.0258620689655

In order of descending priority: tree length, root-to-tip variance, bipartition support

name root-to-tip_var treelength bipartition
uce-1175_chunk_4.tre 0.0 8.48388e-07 0.00862068965517
uce-547_chunk_2.tre 0.0 1.18678e-06 0.0172413793103
uce-815_chunk_8.tre 0.0 0.0202802 0.0

In order of descending priority: tree length, bipartition support, root-to-tip variance

name root-to-tip_var treelength bipartition
uce-1175_chunk_4.tre 0.0 8.48388e-07 0.00862068965517
uce-547_chunk_2.tre 0.0 1.18678e-06 0.0172413793103
uce-815_chunk_8.tre 0.0 0.0202802 0.0

Step 1 of filtering: remove trees of zero length

No effect: the minimum observed tree length is non-zero and equal to:

## [1] 8.48388e-07
## Number of trees after filtering: 3589
## Percentage of trees after filtering: 100%

Step 2 of filtering: remove the shortest and longest 10% of trees:

## Original min. tree length: 8.48388e-07
## Original max. tree length: 209.395
## Min. tree length after step 2: 0.124261
## Max. tree length after step 2: 19.679
## Number of trees after step 2: 2871
## Percentage of trees after step 2: 79.99443%

Step 3 of filtering: remove the most unclocklike third of loci

## Max. root-to-tip variance from step 2: 6.66986
## Max. root-to-tip variance after step 3: 0.00645959
## Number of trees after step 3: 1923
## Percentage of trees after step 3: 53.58038%

Step 4 of filtering: keep the loci in the upper 10% of bipartition support

## Min. bipartition support from step 3: 0
## Min. bipartition support after step 4: 0.06034483
## Number of trees after step 4: 169
## Percentage of trees after step 4: 4.708833%

50-bp chunks from the 75%-complete dataset: SH-like of 90% or above

Phase 1: get_var_length

python src/get_var_length.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-90-taxa/Chunks_90/ --flend .tre --outf Min-90-chunk90_analysis/var-min-90-chunks90 --outg alepisaurus_ferox,ceratoscopelus_warmingii

As in the previous case, the command led to a number of segfaults and “this really only works with nexus or newick” warnings, corresponding to lines in the resulting file that were either incomplete or included NAs.

Phase 2: get_bp_genetrees

python src/get_bp_genetrees.py /Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-90-taxa/Chunks_90/ /Users/David/Grive/Alfaro_Lab/Acanthomorpha/12_no_outgroups_scheme_3.tre --flend .tre --outf Min-90-chunk90_analysis/bp-min-90-chunks90

Phase 3: combine_results

python src/combine_results.py Min-90-chunk90_analysis/var-min-90-chunks90 Min-90-chunk90_analysis/bp-min-90-chunks90 --outf Min-90-chunk90_analysis/comb-min-90-chunks90

Delete the lines with NAs:

comb90chunks90 <- read.table("/Users/David/Grive/Alfaro_Lab/SortaDate/Min-90-chunk90_analysis/comb-min-90-chunks90", fill = TRUE)
filtered <- na.omit(comb90chunks90)
write.table(filtered,
      "/Users/David/Grive/Alfaro_Lab/SortaDate/Min-90-chunk90_analysis/comb-min-90-chunks90-filtered",
      sep = "\t",
      quote = FALSE,
      row.names = FALSE,
      col.names = FALSE)
nrow(filtered)

The length of the filtered comb file is 2319 lines, meaning that SortaDate failed to obtain root-to-tip variance and/or tree length for 1918 trees.

Phase 4: get_good_genes

In order of descending priority: bipartition support, root-to-tip variance, tree length

name root-to-tip_var treelength bipartition
uce-625_chunk_10.tre 0.0675117 0.706842 0.0775862068966
uce-1317_chunk_8.tre 0.0166339 15.2225 0.0689655172414
uce-120_chunk_6.tre 0.187599 4.6936 0.0689655172414
## [1] "The Robinson-Foulds distances of the three best chunks from the reference tree are as follows:"
##                     chunk  rf
## 1686 uce-625_chunk_10.tre 106
##                    chunk  rf
## 629 uce-1317_chunk_8.tre 111
##                   chunk  rf
## 429 uce-120_chunk_6.tre 114

The chunk with the minimum RF distance from the reference tree:

##                    chunk rf
## 678 uce-1340_chunk_2.tre 85

In order of descending priority: bipartition support, tree length, root-to-tip variance

name root-to-tip_var treelength bipartition
uce-625_chunk_10.tre 0.0675117 0.706842 0.0775862068966
uce-120_chunk_6.tre 0.187599 4.6936 0.0689655172414
uce-1317_chunk_8.tre 0.0166339 15.2225 0.0689655172414

In order of descending priority: root-to-tip variance, tree length, bipartition support

name root-to-tip_var treelength bipartition
uce-1175_chunk_4.tre 0.0 8.48388e-07 0.00862068965517
uce-675_chunk_5.tre 0.0 8.58914e-07 0.0
uce-537_chunk_3.tre 0.0 8.71304e-07 0.00862068965517

In order of descending priority: root-to-tip variance, bipartition support, tree length

name root-to-tip_var treelength bipartition
uce-1189_chunk_2.tre 0.0 1.52836 0.0431034482759
uce-739_chunk_2.tre 0.0 2.2254e-06 0.0344827586207
uce-67_chunk_4.tre 0.0 0.0214551 0.0344827586207

In order of descending priority: tree length, root-to-tip variance, bipartition support

name root-to-tip_var treelength bipartition
uce-1175_chunk_4.tre 0.0 8.48388e-07 0.00862068965517
uce-675_chunk_5.tre 0.0 8.58914e-07 0.0
uce-537_chunk_3.tre 0.0 8.71304e-07 0.00862068965517

In order of descending priority: tree length, bipartition support, root-to-tip variance

name root-to-tip_var treelength bipartition
uce-1175_chunk_4.tre 0.0 8.48388e-07 0.00862068965517
uce-675_chunk_5.tre 0.0 8.58914e-07 0.0
uce-537_chunk_3.tre 0.0 8.71304e-07 0.00862068965517

Step 1 of filtering: remove trees of zero length

No effect: the minimum observed tree length is non-zero and equal to:

## [1] 8.48388e-07
## Number of trees after filtering: 2319
## Percentage of trees after filtering: 100%

Step 2 of filtering: remove the shortest and longest 10% of trees:

## Original min. tree length: 8.48388e-07
## Original max. tree length: 141.225
## Min. tree length after step 2: 0.111172
## Max. tree length after step 2: 20.8823
## Number of trees after step 2: 1855
## Percentage of trees after step 2: 79.99138%

Step 3 of filtering: remove the most unclocklike third of loci

## Max. root-to-tip variance from step 2: 40.1753
## Max. root-to-tip variance after step 3: 0.00177898
## Number of trees after step 3: 1243
## Percentage of trees after step 3: 53.60069%

Step 4 of filtering: keep the loci in the upper 10% of bipartition support

## Min. bipartition support from step 3: 0
## Min. bipartition support after step 4: 0.03448276
## Number of trees after step 4: 66
## Percentage of trees after step 4: 2.846054%

Out of the 66 chunks selected from the SH-like > 0.9 dataset, 65 were 50-bp long. To facilitate PartitionFinder searches, these chunks were aligned first using SequenceMatrix v1.8, with the last (47-bp) chunk attached to the resulting alignment afterwards. This made it easier to automate the calculation of base pair ranges for each locus (the latter needed to be included in the PartitionFinder configuration file). The outgroups (Alepisaurus ferox and Ceratoscopelus warmingii) were excluded from the concatenated alignment, as they were not present in the available topology constraint.

PartitionFinder was first run with the “rcluster” search option. Although the “models” option was set to “all”, since rcluster searches can only be performed in RAxML, this effectively limited the analysis to the three models implemented in the latter software (GTR, GTR+\(\Gamma\), GTR+\(\Gamma\)+I). This run yielded 8 subsets of the following properties:

Subset Best Model Sites Rate under GTR+\(\Gamma\) ID
1 GTR+\(\Gamma\) 1047 1.048826 76dec7513c9b37738bfac30d5d512cf3
2 GTR+I+\(\Gamma\) 1100 0.672610 111ec939b8c23b0e4e2529cee50f387a
3 GTR+\(\Gamma\) 550 0.281570 00fea7ffef92b99d78df6650b56ebb0c
4 GTR+\(\Gamma\) 100 0.965800 065a20f09eb1fa7383dedb55df51e100
5 GTR+\(\Gamma\) 200 2.278670 a6afbf36fc27e68c089e09c31ca3c71f
6 GTR+\(\Gamma\) 50 1.682760 da54ec6f6966e6b696433ef124df9e1f
7 GTR+I+\(\Gamma\) 50 2.739838 2d0d981211d9b6e3f364c1fcd221b555
8 GTR+\(\Gamma\) 200 1.356237 fd9894d33a357dcdab10c55c5153eb16

In the next step, the alignment was analyzed under the “greedy” search option, with the BEAST model set and without the “–raxml” flag. This search recovered the following 14 partitions:

Subset Best Model Sites Tree size under the best model ID
1 HKY+\(\Gamma\)+X 100 6.07552 3f353939a600a6973a6a1e0608925da6
2 TRNEF+\(\Gamma\) 200 4.34326 ceefdb63d1de1122404ab11285f05f27
3 JC+\(\Gamma\) 50 2.80253 ac2a21040350ec54b4664bc752a13a45
4 TRNEF+\(\Gamma\) 300 1.53147 baed377a41689244a752bc9e3169c975
5 GTR+I+\(\Gamma\)+X 350 3.44224 ce05da9c4ae84b4a74cb468b18fcda82
6 K80+I+\(\Gamma\) 450 3.71596 16b9f71868c90950d0dd144fd5de511f
7 SYM+\(\Gamma\) 300 8.15389 c5076d62640bd9eeabe53de6e5bf7f7a
8 K80+\(\Gamma\) 150 9.20524 d5aeca1d1fad19fc89dbb336bad98cbd
9 K80+\(\Gamma\) 497 5.13898 e7f992b256534832ccdb46c09aa6e9a4
10 HKY+\(\Gamma\)+X 300 2.87180 4e722846245f8cd6600c32f3131120a6
11 TRNEF+\(\Gamma\) 250 5.87708 72ad9849e1f3be2cbffa1097935e00fc
12 GTR+I+\(\Gamma\)+X 50 36.99851 2d0d981211d9b6e3f364c1fcd221b555
13 JC+\(\Gamma\) 150 1.20339 d687ea29d8ab88ed27334eafc8164ec8
14 TRNEF+\(\Gamma\) 150 13.44953 00f862e3cabdf1be1b99bc5b1455ba78

(In PhyML, “tree size” denotes the sum of edge lengths: see http://github.com/stephaneguindon/phyml/blob/master/doc/phyml-manual.tex. “X” denotes estimated, as opposed to empirical, base frequencies.)

BEAST analysis

The two 50-bp partitions were removed. Each partition received its own substitution model, which was generally identical to that selected by PartitionFinder, except for those models that contained the I parameter (proportion of invariant sites). Since this parameter and \(\Gamma\) account for the same phenomenon (rate heterogeneity across sites), their simultaneous inclusion causes the resulting model to be non-identifiable, leading to potential mixing problems in MCMC simulations. The models that were not directly available in BEAUTi were implemented as follows:

  • TrNef: TN93 + equal base frequencies
  • K80: HKY85 + equal base frequencies
  • SYM: GTR + equal base frequencies

Each \(\Gamma\) rate heterogeneity distribution was discretized into 4 categories. The default improper prior on the relative rates (the allMus parameter) was set to a gamma distribution with a shape of 0.001 and a scale of 1000 following the recommendations at https://www.biostat.washington.edu/sites/default/files/modules//2016_SISMID_13_11.pdf. All the priors on substitution model parameters were kept at their default values.

In contrast to the substitution models, the parameters of the uncorrelated lognormal clock model were linked across partitions. The ucld.mean hyperparameter was assigned a lognormal hyperprior with a mean of 0.005 (in real space) and a log standard deviation of 1, with the initial value set equal to 0.005. A truncated exponential hyperprior with support (0, 1), a mean of 0.3, and an initial value of 0.1 was used for ucld.stdev.

The “fixed tree topology” operator mix was used (based on a user-supplied topology common to all partitions), with the tuning of the ucld.mean and ucld.stdev operators set to 0.9 (default value = 0.75) and their weight doubled (from 3.0 to 6.0). Default tuning values and weights were used for all the remaining operators.

The 12 internal calibrations were all implemented as exponentials, and the (Polymixia + Aphredoderus) received the corrected calibration whose 95th percentile was equal to 116.35 Ma. A truncated exponential distribution supported on (98 Ma, 143 Ma) was constructed to calibrate the root of the tree, with its mean equal to the midpoint of the support interval (22.5 without the offset).

Finally, the MCMC simulation was run for 500 million generations, with a sampling period of 1000 generations.

beast -threads 12 -beagle_SSE concatchunks.xml

Two more analyses were performed using BEAST; their settings were identical to those described above except for the clock model used. In both cases, a random local clock model was utilized, with one analysis using the uncorrelated version of the model (with the ratesAreMultipliers option in the XML file set to the default value of "false") and the other employing the correlated version (ratesAreMultipliers="true"). In both analyses, default priors were placed on the parameters of the clock model.

beast -threads 6 -beagle_SSE concatchunks-urlc.xml
beast -threads 6 -beagle_SSE concatchunks-crlc.xml

The analysis finished after 12.597 days and the resulting log file was examined using LogAnalyser to determine the EES value of each parameter:

java -Xmx6000m -cp /home/analysis/.linuxbrew/lib/beast.jar dr.app.tools.LogAnalyser -burnin 50000000 -ess concatchunks.log
##                        statistic          median         ESS
## 1                      posterior -48625.69180000   3413.0600
## 2                          prior  -2049.74610000    844.2895
## 3                     likelihood -46575.83370000  32241.1992
## 4           treeModel.rootHeight    137.72570000   2238.3418
## 5              tmrca(Aipichthys)    120.61650000   1324.2513
## 6           tmrca(Berybolcensis)     55.65810000   6036.6497
## 7               tmrca(Calatomus)     23.59360000    718.9452
## 8          tmrca(Chaetodontidae)     37.91700000   6567.4848
## 9           tmrca(Eastmanalepes)     50.96060000  16986.7366
## 10            tmrca(Eobuglossus)     43.01970000   8226.7418
## 11            tmrca(Eucoelopoma)     56.77560000   5699.0608
## 12         tmrca(Homonotichthys)     98.71850000   1207.3403
## 13           tmrca(Mcconichthys)     66.97340000    688.1744
## 14                   tmrca(Mene)     59.19900000   4229.6624
## 15         tmrca(Ramphexocoetus)     53.71620000   3420.8121
## 16                   tmrca(Root)    137.72570000   2238.3418
## 17                 tmrca(Tarkus)     49.83070000  13887.1125
## 18     birthDeath.meanGrowthRate      0.01870000   3785.5560
## 19  birthDeath.relativeDeathRate      0.05030000 125120.0000
## 20            Subset1HKYGX.kappa      3.30330000 336740.0000
## 21     Subset1HKYGX.frequencies1      0.38220000 160000.0000
## 22     Subset1HKYGX.frequencies2      0.23020000 194740.0000
## 23     Subset1HKYGX.frequencies3      0.15200000 188220.0000
## 24     Subset1HKYGX.frequencies4      0.23390000 191870.0000
## 25            Subset1HKYGX.alpha      0.98970000 344070.0000
## 26          Subset2TRNEFG.kappa1      4.82390000 339050.0000
## 27          Subset2TRNEFG.kappa2      1.81510000 353590.0000
## 28           Subset2TRNEFG.alpha      0.67370000 343600.0000
## 29          Subset3TRNEFG.kappa1      3.05060000 330620.0000
## 30          Subset3TRNEFG.kappa2      4.86070000 327930.0000
## 31           Subset3TRNEFG.alpha      0.46710000 335870.0000
## 32              Subset4GTRIGX.ac      0.17860000 142440.0000
## 33              Subset4GTRIGX.ag      0.41800000  69343.9763
## 34              Subset4GTRIGX.at      0.10600000 158030.0000
## 35              Subset4GTRIGX.cg      0.34480000 106840.0000
## 36              Subset4GTRIGX.gt      0.14930000 132440.0000
## 37    Subset4GTRIGX.frequencies1      0.30180000 109180.0000
## 38    Subset4GTRIGX.frequencies2      0.21230000  87820.5730
## 39    Subset4GTRIGX.frequencies3      0.31520000  91691.2293
## 40    Subset4GTRIGX.frequencies4      0.16960000  98336.1071
## 41           Subset4GTRIGX.alpha      0.44590000 276290.0000
## 42            Subset5K80IG.kappa      3.56400000 364980.0000
## 43            Subset5K80IG.alpha      0.31900000 370210.0000
## 44                Subset6SYMG.ac      0.31450000 233040.0000
## 45                Subset6SYMG.ag      1.61400000 171440.0000
## 46                Subset6SYMG.at      0.24760000 241230.0000
## 47                Subset6SYMG.cg      1.23840000 197650.0000
## 48                Subset6SYMG.gt      0.24490000 285190.0000
## 49             Subset6SYMG.alpha      0.52120000 361230.0000
## 50             Subset7K80G.kappa      2.93730000 365130.0000
## 51             Subset7K80G.alpha      0.70830000 361470.0000
## 52             Subset8K80G.kappa      3.17480000 373600.0000
## 53             Subset8K80G.alpha      1.22040000 364000.0000
## 54            Subset9HKYGX.kappa      3.59770000 358280.0000
## 55     Subset9HKYGX.frequencies1      0.20910000 185070.0000
## 56     Subset9HKYGX.frequencies2      0.25900000 189260.0000
## 57     Subset9HKYGX.frequencies3      0.30580000 174970.0000
## 58     Subset9HKYGX.frequencies4      0.22510000 192860.0000
## 59            Subset9HKYGX.alpha      0.62780000 344070.0000
## 60         Subset10TRNEFG.kappa1      4.17550000 337400.0000
## 61         Subset10TRNEFG.kappa2      6.02560000 342470.0000
## 62          Subset10TRNEFG.alpha      0.50600000 338290.0000
## 63             Subset11JCG.alpha      0.38770000 282140.0000
## 64         Subset12TRNEFG.kappa1      3.08980000 324920.0000
## 65         Subset12TRNEFG.kappa2      7.49850000 321000.0000
## 66          Subset12TRNEFG.alpha      0.33910000 220060.0000
## 67               Subset1HKYGX.mu      1.17860000 183520.0000
## 68              Subset2TRNEFG.mu      0.86830000 155210.0000
## 69              Subset3TRNEFG.mu      0.28780000 112480.0000
## 70              Subset4GTRIGX.mu      0.71110000 105770.0000
## 71               Subset5K80IG.mu      0.81240000  89865.7064
## 72                Subset6SYMG.mu      1.65800000  86079.4311
## 73                Subset7K80G.mu      1.92930000 131010.0000
## 74                Subset8K80G.mu      1.05670000  92400.6821
## 75               Subset9HKYGX.mu      0.59180000 131650.0000
## 76             Subset10TRNEFG.mu      1.18620000 132450.0000
## 77                Subset11JCG.mu      0.20350000 110950.0000
## 78             Subset12TRNEFG.mu      2.54390000  79083.6503
## 79                     ucld.mean      0.00073594    332.5618
## 80                    ucld.stdev      0.98430000   2649.2935
## 81                      meanRate      0.00068979    700.5540
## 82        coefficientOfVariation      1.27000000    226.7723
## 83                    covariance      0.23290000    433.6862
## 84   Subset1HKYGX.treeLikelihood  -1890.61200000  36770.2075
## 85  Subset2TRNEFG.treeLikelihood  -2931.73400000  35236.5308
## 86  Subset3TRNEFG.treeLikelihood  -2228.42960000  36860.1607
## 87  Subset4GTRIGX.treeLikelihood  -4743.64810000  29583.0309
## 88   Subset5K80IG.treeLikelihood  -5280.81190000  32435.6649
## 89    Subset6SYMG.treeLikelihood  -5733.31740000  32350.1559
## 90    Subset7K80G.treeLikelihood  -3537.00090000  26073.9083
## 91    Subset8K80G.treeLikelihood  -8747.29310000  18666.6737
## 92   Subset9HKYGX.treeLikelihood  -3646.88110000  24260.8424
## 93 Subset10TRNEFG.treeLikelihood  -4255.05270000  39038.3017
## 94    Subset11JCG.treeLikelihood   -949.70740000   9002.7411
## 95 Subset12TRNEFG.treeLikelihood  -2630.61690000  25585.5111
## 96                   branchRates      0.00000000   1000.0000
## 97                    speciation   -580.10770000    566.5915

Since all the ESS values exceeded 200, the sample from the posterior was summarized using TreeAnnotator:

java -Xmx20000m -cp /home/analysis/.linuxbrew/lib/beast.jar dr.app.tools.TreeAnnotator -burnin 50000000 -heights median concatchunks.trees chunks-BEAST.tre
Comparison with the MCMCTree-generated tree from the manuscript

Comparison with the MCMCTree-generated tree from the manuscript

MCMCTree analysis

In PAML, subsets of a concatenated alignment cannot be simply ignored as in BEAST but must be removed from the file, so that the sum of partition lengths equals the total number of nucleotides (PAML manual: p. 13). Moreover, the sites that make up a partition must be adjacent in the alignment. Therefore, the following information from the best_scheme file generated by PartitionFinder was used to generate a PAML-compatible alignment:

1 locus42, locus1
2 locus57, locus3, locus25, locus2
3 locus4
4 locus61, locus5, locus55, locus63, locus23, locus47
5 locus16, locus54, locus45, locus31, locus7, locus64, locus6
6 locus15, locus24, locus8, locus11, locus53, locus52, locus51, locus20, locus26
7 locus14, locus17, locus9, locus27, locus22, locus28
8 locus10, locus21, locus13
9 locus62, locus37, locus59, locus29, locus12, locus58, locus46, locus66, locus44, locus18
10 locus40, locus39, locus49, locus19, locus32, locus38
11 locus35, locus60, locus33, locus43, locus30
12 locus34
13 locus36, locus48, locus65
14 locus56, locus50, locus41
  1. Using bash, replace whitespaces between each taxon name and the corresponding sequence with line breaks:

    xargs -n 1 < concat.phy > concat2.phy
  2. Manually remove the first line indicating the number of taxa and sites.

  3. Now, use the structure of the new file (with names and sequences in alternating rows) to reorganize the chunks:

    concat <- read.table("/Users/David/Downloads/concat2.phy", stringsAsFactors = FALSE)
    
    # Random check, part 1: print "locus 42" (sites 2051--2100) of the first taxon:
    substring(concat[2,], 2051, 2100)
    
    for(i in seq(2, nrow(concat), by = 2)) {
      chunks <- substring(concat[i,], seq(1, 3250, 50), seq(50, 3250, 50))
      chunks[66] <- substring(concat[i,], 3251, 3297)
      ch1 <- paste(chunks[42], chunks[1], sep = "")
      ch2 <- paste(chunks[57], chunks[3], chunks[25], chunks[2], sep = "")
      ch3 <- paste(chunks[61], chunks[5], chunks[55], chunks[63], chunks[23], chunks[47], sep = "")
      ch4 <- paste(chunks[16], chunks[54], chunks[45], chunks[31], chunks[7], chunks[64], chunks[6], sep = "")
      ch5 <- paste(chunks[15], chunks[24], chunks[8], chunks[11], chunks[53], chunks[52], chunks[51], chunks[20], chunks[26], sep = "")
      ch6 <- paste(chunks[14], chunks[17], chunks[9], chunks[27], chunks[22], chunks[28], sep = "")
      ch7 <- paste(chunks[10], chunks[21], chunks[13], sep = "")
      ch8 <- paste(chunks[62], chunks[37], chunks[59], chunks[29], chunks[12], chunks[58], chunks[46], chunks[66], chunks[44], chunks[18], sep = "")
      ch9 <- paste(chunks[40], chunks[39], chunks[49], chunks[19], chunks[32], chunks[38], sep = "")
      ch10 <- paste(chunks[35], chunks[60], chunks[33], chunks[43], chunks[30], sep = "")
      ch11 <- paste(chunks[36], chunks[48], chunks[65], sep = "")
      ch12 <- paste(chunks[56], chunks[50], chunks[41], sep = "")
      concat[i,] <- paste(ch1, ch2, ch3, ch4, ch5, ch6, ch7, ch8, ch9, ch10, ch11, ch12, sep = "")
    }
    
    # Make sure that the new sequence rows have the desired length:
    for(i in seq(2, nrow(concat), by = 2)) {
      print(nchar(concat[i,]))
    }
    
    # Random check, part 2: "locus 42" should now correspond to sites 1--50. Does it?
    substring(concat[2,], 1, 50)
    
    # Yes! Now print the new alignment into a table:
    write.table(concat,
      "/Users/David/Grive/Alfaro_Lab/Acanthomorpha/sate-gblocks-clean-min-90-taxa/Concat/pamlconcat.phy",
      quote = FALSE,
      row.names = FALSE,
      col.names = FALSE)
  4. Add the following two lines to the beginning of the file:

    118 3197 G
    G 12 100 200 300 350 450 300 150 497 300 250 150 150

Unlike BEAST, PAML cannot assign a separate substitution model to each partition, but it is capable of unlinking substitution model parameters across partitions (Warnock et al. 2014: ESM p. 2). Since moderate substitution model overparameterization usually does not pose a problem to Bayesian phylogenetic analyses (Ronquist & Deans 2010), each partition was assigned its own GTR+\(\Gamma\) (“REV”) model. Note that the unlinking of substitution models necessitates the use of empirical (nhomo = 0) rather than estimated base frequencies. Relative rate, equilibrium frequency, and alpha parameters were unlinked across partitions, but branch lengths were kept linked (options Mgene = 4 and Malpha = 1). To facilitate cross-platform comparisons, the root calibration was set to 120.5 Ma (same as the mean of the root prior used in BEAST) in baseml to calculate substitution rates.

The substitution rate estimation finished up in 5:46:48 and yielded the following values:

Partition Rate (subst. per 100 million years)
Gene 1 0.126608
Gene 2 0.087695
Gene 3 0.033200
Gene 4 0.072678
Gene 5 0.072362
Gene 6 0.158487
Gene 7 0.188372
Gene 8 0.107932
Gene 9 0.060550
Gene 10 0.111402
Gene 11 0.026772
Gene 12 0.184536

For MCMCTree, the G option cannot be used, and the partitions must be given as multiple alignments one after another in a single file (http://groups.google.com/forum/#!topic/pamlsoftware/cC7mOgZnNiY). Such a file was compiled as follows:

  1. Run the following code:

    concat <- read.table("/Users/David/Downloads/concat2.phy", stringsAsFactors = FALSE)
    
    alignments <- matrix(nrow = 12*(nrow(concat) + 2), ncol = 1)
    for(i in 1:(nrow(concat)/2)) {
        for(j in 0:11) {
            alignments[(2*i-1) + j*(nrow(concat) + 2), ] <- concat[(2*i-1),]
        }
      chunks <- substring(concat[2*i,], seq(1, 3250, 50), seq(50, 3250, 50))
      chunks[66] <- substring(concat[2*i,], 3251, 3297)
      alignments[2*i,] <- paste(chunks[42], chunks[1], sep = "")
      alignments[2*i + 1*(nrow(concat) + 2),] <- paste(chunks[57], chunks[3], chunks[25], chunks[2], sep = "")
      alignments[2*i + 2*(nrow(concat) + 2),] <- paste(chunks[61], chunks[5], chunks[55], chunks[63], chunks[23], chunks[47], sep = "")
      alignments[2*i + 3*(nrow(concat) + 2),] <- paste(chunks[16], chunks[54], chunks[45], chunks[31], chunks[7], chunks[64], chunks[6], sep = "")
      alignments[2*i + 4*(nrow(concat) + 2),] <- paste(chunks[15], chunks[24], chunks[8], chunks[11], chunks[53], chunks[52], chunks[51], chunks[20], chunks[26], sep = "")
      alignments[2*i + 5*(nrow(concat) + 2),] <- paste(chunks[14], chunks[17], chunks[9], chunks[27], chunks[22], chunks[28], sep = "")
      alignments[2*i + 6*(nrow(concat) + 2),] <- paste(chunks[10], chunks[21], chunks[13], sep = "")
      alignments[2*i + 7*(nrow(concat) + 2),] <- paste(chunks[62], chunks[37], chunks[59], chunks[29], chunks[12], chunks[58], chunks[46], chunks[66], chunks[44], chunks[18], sep = "")
      alignments[2*i + 8*(nrow(concat) + 2),] <- paste(chunks[40], chunks[39], chunks[49], chunks[19], chunks[32], chunks[38], sep = "")
      alignments[2*i + 9*(nrow(concat) + 2),] <- paste(chunks[35], chunks[60], chunks[33], chunks[43], chunks[30], sep = "")
      alignments[2*i + 10*(nrow(concat) + 2),] <- paste(chunks[36], chunks[48], chunks[65], sep = "")
      alignments[2*i + 11*(nrow(concat) + 2),] <- paste(chunks[56], chunks[50], chunks[41], sep = "")
    }
    write.table(alignments, "/Users/David/Downloads/pamlconcatpartitioned.phy", quote = FALSE, row.names = FALSE, col.names = FALSE, na = "")
  2. Manually add the information about the number of taxa and sites in each partition.

To obtain a single estimate that could be used for the gamma-Dirichlet hyperprior on rates (rgene_gamma), a weighted average of these values was computed, with each rate weighted by the length of the corresponding partition:

## [1] 0.09550072

The shape parameter of the gamma-Dirichlet distribution (\(\alpha\)) was set to 2 and the rate parameter (\(\beta\)) was chosen so that the mean (calculated as \(\frac{\alpha}{\beta}\)) was equal to the rate above (expressed as the number of substitutions per 10 million years):

## [1] 209.4225

The hyperprior on the mean of the rate distribution is distributed as follows:

The mean of the gamma hyperprior on the variance of the log rate was set to 0.1 by setting \(\alpha\) equal to 1 and \(\beta\) equal to 10. This corresponds closely to the mean of the ucld.stdev hyperprior in BEAST (0.3 – note that while BEAST places the prior on the standard deviation of the rate distribution, MCMCTree assigns the prior to the variance, or the square of the standard deviation).

The lognormal distribution of rates is plotted below, with the mean and variance (in log-space) set equal to the means of the respective hyperpriors:

Finally, since there are multiple loci, the Dirichlet concentration parameter \(\alpha_D\) was specified and set to the default value of 1, which is described as producing “a reasonable partitioning” in the MCMCTree manual.

The full configuration file is shown below:

          seed = -1
       seqfile = pamlconcatpartitioned.phy
      treefile = 12_cali_no_outgroups_corrected.tre
       outfile = chunks.txt

         ndata = 12                            * Number of partitions
       seqtype = 0                             * Data type: nucleotides
       usedata = 3                             * Store the Hessian matrix for approximate likelihood computation in out.BV
         clock = 2                             * Uncorrelated lognormal relaxed clock
       RootAge = 'B(9.8, 14.3, 1e-300, 0.05)'  * P of less than 98 Ma = 10^(-300) and P of more than 143 Ma = 0.05

         model = 7                             * GTR
         alpha = 0.1                           * Following Alfaro et al.
         ncatG = 8                             * Following Alfaro et al.

     cleandata = 0                             * Do not remove sites with ambiguity

       BDparas = 0.1 0.1 00.1                  * Birth, death, sampling: following Alfaro et al.
   kappa_gamma = 6 2                           * No effect since usedata = 3
   alpha_gamma = 1 1                           * No effect since usedata = 3

   rgene_gamma = 2 209.42 1                    * Gamma-Dirichlet prior on mean rate: estimated using baseml under strict clock
  sigma2_gamma = 1 10 1                        * Gamma-Dirichlet prior on log rate variance

      finetune = 1: .1 .1 .1 .1 .1 .1          * Auto finetune: times, musigma2, rates, mixing, paras, FossilErr

         print = 2                             * Print branch rates into an output file
        burnin = 500000                        * Following Alfaro et al.
      sampfreq = 500                           * Following Alfaro et al.
       nsample = 15000                         * Following Alfaro et al.

The initial analyses (run under the usedata = 3 option to calculate the Hessian matrices for the branch lengths) produced the following warning messages:

xmax = 0.0000e+00 close to zero at 226!
xmax = 0.0000e+00 close to zero at 225!
xmax = 0.0000e+00 close to zero at 223!
xmax = 0.0000e+00 close to zero at 224!

However, these did not cause baseml to crash, and the Hessian matrices were successfully written into out.BV files. Therefore, the usedata variable was set to 2, the out.BV files were moved to in.BV, and four separate MCMC chains with a length of 75 million generations (as specified in the configuration file above) were started.

Refs:

Ronquist F, Deans AR 2010 Bayesian phylogenetics and its influence on insect systematics. Annu Rev Entomol 55: 189–206

Warnock RCM, Parham JF, Joyce WG, Lyson TR, Donoghue PCJ 2014 Calibration uncertainty in molecular dating analyses: there is no substitute for the prior evaluation of time priors. Proc R Soc B 282(1798): 20141013